# __________Packages__________
import pandas as pd
import collections
import random
import sys
import xgboost as xgb
import os
import time
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import make_scorer, mean_absolute_error, median_absolute_error,r2_score,explained_variance_score
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import roc_auc_score
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import scipy.stats
import scikit_posthocs as sp
import seaborn as sns
import matplotlib.pyplot as plt
import shap
# _________Import regression models for task___________
# pip install rgf_python
# sys.path.insert(1, 'palobst')
from catboost import CatBoostRegressor
from rgf.sklearn import RGFRegressor
from optboosting.boosting import *
from palobst import *
# _______Path to folder with datasets_________
# Folder whose CSV files are the regression benchmark datasets;
# every file in it is treated as one dataset.
data_field='regressionDatasets'
datasets=os.listdir(data_field)
def label_encoder(df):
    """Integer-encode every object-dtype column of *df*, then fill NaNs.

    Each distinct value of a categorical column is mapped to 0, 1, 2, ...
    in order of first appearance; afterwards any remaining NaN anywhere
    in the frame is replaced with the sentinel -999.

    Note: the categorical columns of the passed DataFrame are mutated in
    place; the (fillna-copied) frame is also returned.
    """
    cat_mask = df.dtypes == object
    cat_features = df.columns[cat_mask].tolist()
    for feature in cat_features:
        # enumerate() replaces the original hand-rolled running counter.
        dict_map = {value: code for code, value in enumerate(df[feature].unique())}
        df[feature] = df[feature].map(dict_map)
    df = df.fillna(-999)
    return df
# Hyper-parameter search spaces, one grid per regressor under test.
params_PaloBst = dict(
    distribution=["gaussian"],
    learning_rate=np.arange(0.01, 1, 0.05),
    n_estimators=np.arange(100, 500, 50),
    subsample=np.arange(0.1, 1, 0.1),
)
params_RGFR = dict(
    l2=np.arange(0.01, 1, 0.05),
    max_leaf=range(1000, 10000, 200),
    learning_rate=np.arange(0.01, 1, 0.05),
)
params_AcceleratedBoost = dict(
    descent=['proximal', 'gradient'],
    loss=['ls', 'lad'],
    learning_rate=np.arange(0.01, 1, 0.05),
    n_estimators=np.arange(100, 500, 50),
)
params_CatBoost = dict(
    learning_rate=np.arange(0.05, 0.31, 0.05),
    depth=[4, 6, 10],
    l2_leaf_reg=[1, 3, 5, 7, 9],
)
def learning(alpha, cv=3, n_iter=50):
    """Tune, train and evaluate one regressor selected by *alpha*.

    alpha: 1 -> PaloBst, 2 -> RGFR, 3 -> AcceleratedBoost (OptBoosting),
    anything else -> CatBoost.  Relies on the module-level globals
    X_train, X_test, y_train, y_test and the params_* search grids.

    Returns a tuple:
        (name, mse, mae, median_ae, r2, explained_variance,
         training_time_sec, inference_time_per_1000_rows, best_params)
    """
    mse_scorer = make_scorer(MSE, greater_is_better=False)
    if alpha == 1:
        model = PaloBst(distribution="gaussian")
        model.warmup()
        params = params_PaloBst
        name = 'PaloBst'
    elif alpha == 2:
        model = RGFRegressor(loss="LS", normalize=True)
        params = params_RGFR
        name = 'RGFR'
    elif alpha == 3:
        model = OptBoosting(step=100)
        params = params_AcceleratedBoost
        name = 'AcceleratedBoost'
    else:
        model = CatBoostRegressor(silent=True)
        params = params_CatBoost
        name = 'CatBoost'
    param = {}
    if name != 'CatBoost':
        RS = RandomizedSearchCV(model, params, cv=cv, n_iter=n_iter, scoring=mse_scorer)
        start_train_time = time.time()
        RS.fit(X_train, y_train)
        training_time = time.time() - start_train_time
        # BUGFIX: the original read RS.get_params()['estimator__'+key],
        # which returns the *unfitted base estimator's defaults*; the
        # tuned values found by the search live in best_params_.
        param = dict(RS.best_params_)
    else:
        # CatBoost ships its own search; it fits the model in place
        # (the original bound the result to an unused RSr variable).
        start_train_time = time.time()
        model.randomized_search(params_CatBoost, X_train, y_train,
                                cv=cv, n_iter=n_iter, verbose=False)
        training_time = time.time() - start_train_time
        RS = model
        param = model.get_params()
    len_rows = X_test.shape[0]
    start_pred_time = time.time()
    prediction = RS.predict(X_test)
    inference_time = time.time() - start_pred_time
    # Normalise inference cost to "seconds per 1000 rows".
    inference_time_1000 = (inference_time / len_rows) * 1000
    index1 = MSE(y_test, prediction)
    index2 = mean_absolute_error(y_test, prediction)
    index3 = median_absolute_error(y_test, prediction)
    index4 = r2_score(y_test, prediction)
    index5 = explained_variance_score(y_test, prediction)
    return (name, index1, index2, index3, index4, index5,
            training_time, inference_time_1000, param)
# Run the full benchmark: for every dataset, 10 random 70/30 splits,
# 4 models per split; write one result CSV per dataset into output/.
for dataset in datasets:
    dataset_name = []
    alghotihm = []
    cross_validation = []
    hypermarametrs = []
    mse_value = []
    mean_abs_err_value = []
    median_abs_err_value = []
    r2_value = []
    exp_var_score = []
    train_time = []
    test_time = []
    df = pd.read_csv(data_field + '/' + dataset)
    df = label_encoder(df)
    # Convention: the last column of each CSV is the regression target.
    y = df.iloc[:, -1]
    X = df.drop(df.columns[len(df.columns) - 1], axis=1)
    for split in range(1, 11):
        # NOTE: the random_state is itself random, so splits are not
        # reproducible across runs (behavior kept from the original).
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=random.randint(1, 100))
        X_train = X_train.reset_index(drop=True).to_numpy()
        X_test = X_test.reset_index(drop=True).to_numpy()
        y_train = y_train.reset_index(drop=True).to_numpy()
        y_test = y_test.reset_index(drop=True).to_numpy()
        for model in range(1, 5):
            name, index1, index2, index3, index4, index5, training_time, inference_time, param = learning(model)
            dataset_name.append(dataset)
            alghotihm.append(name)
            train_time.append(training_time)
            test_time.append(inference_time)
            mse_value.append(index1)
            mean_abs_err_value.append(index2)
            median_abs_err_value.append(index3)
            r2_value.append(index4)
            exp_var_score.append(index5)
            hypermarametrs.append(param)
            cross_validation.append(split)
    df = pd.DataFrame()
    df['Dataset name'] = dataset_name
    df['Algorithm'] = alghotihm
    df['MSE'] = mse_value
    # BUGFIX: the four columns below were all copy-pasted from mse_value,
    # so every error column silently repeated the MSE.
    df['Mean absolute error'] = mean_abs_err_value
    df['Median absolute error'] = median_abs_err_value
    df['R2 score'] = r2_value
    df['Explained variance'] = exp_var_score
    df['Training time'] = train_time
    df['Inference time'] = test_time
    df['Hyper-Parameters'] = hypermarametrs
    df['Cross Validation'] = cross_validation
    df.to_csv('output/{}.csv'.format(dataset), index=False)
# Merge the per-dataset result files into one table and persist it.
path_output = 'output/'
df_list = [pd.read_csv(path_output + fname) for fname in os.listdir(path_output)]
df_final = pd.concat(df_list)
df_final.to_csv('results/table_of_results.csv', index=False)
df_final.to_excel('results/table_of_results.xlsx', index=False)
print('Datasets in dataframe:', df_final['Dataset name'].unique().shape[0])
df_final.head(5)
# Mean MSE per (dataset, algorithm), reshaped to one row per dataset —
# the input layout expected by the Friedman test below.
df_stat = df_final.groupby(['Dataset name', 'Algorithm'])['MSE'].mean().reset_index()
df_stat = df_stat.pivot(index='Dataset name', columns='Algorithm', values='MSE').reset_index()
df_stat.to_csv('results/table_for_test_friedman.csv', index=False)
df_stat.to_excel('results/table_for_test_friedman.xlsx', index=False)
df_stat.head(5)
# For each dataset, record which algorithm achieved the row-minimum MSE.
min_values = df_stat.min(axis=1)
result_dict = {}
for idx, row in df_stat.iterrows():
    # Normalise the dataset name to match the keys used in the meta table.
    data_name = row['Dataset name'].split('.')[0].replace('-', "_")
    if data_name == 'Bike Sharing in Washington_kaggle':
        data_name = 'Bike_Sharing_in_Washington_kaggle'
    # First algorithm (in this fixed order) matching the row minimum wins,
    # mirroring the tie-breaking of the original if/elif chain.
    for algo in ('AcceleratedBoost', 'CatBoost', 'PaloBst', 'RGFR'):
        if row[algo] == min_values[idx]:
            result_dict[data_name] = algo
            break
# Tally wins per algorithm and visualise them as a horizontal bar chart.
c = collections.Counter(list(result_dict.values()))
number_wins = pd.DataFrame(list(c.items()), columns=['Algorithm', 'Number_wins'])
number_wins = number_wins.sort_values(by='Number_wins', ascending=False)
display(number_wins)
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(y='Algorithm', x='Number_wins', orient="h", data=number_wins, ax=ax)
df_final
# Mean training / inference time per algorithm, slowest first.
grouped = df_final.groupby(['Algorithm'])
df_time_tr = (grouped['Training time'].mean().reset_index()
              .sort_values(by='Training time', ascending=False))
df_time_inf = (grouped['Inference time'].mean().reset_index()
               .sort_values(by='Inference time', ascending=False))
print('Average training time')
display(df_time_tr)
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(y='Algorithm', x='Training time', orient="h", data=df_time_tr, ax=ax)
print('Average infer time')
display(df_time_inf)
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(y='Algorithm', x='Inference time', orient="h", data=df_time_inf, ax=ax)
def test_statistic(df,labels,sign_level=0.05):
data=df[labels].to_numpy()
p_value=scipy.stats.friedmanchisquare(*data.T)[1]
if p_value<sign_level:
print('P-value ={}\nH0 is reject\nProvide Post Hoc Test'.format(p_value))
post_hoc=sp.posthoc_nemenyi_friedman(data)
post_hoc.columns=labels
post_hoc.index=labels
display(post_hoc)
cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
heatmap_args = {'cmap': cmap, 'linewidths': 0.25, 'linecolor': '0.5', 'clip_on': False, 'square': True, 'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]}
sp.sign_plot(post_hoc, **heatmap_args)
else:
print('P-value ={}\nH0 is not reject/n'.format(p_value))
# Friedman + Nemenyi significance analysis over the per-dataset mean-MSE table.
test_statistic(df_stat,labels=['AcceleratedBoost','CatBoost','PaloBst','RGFR'])
def label_encoder(df):
    """Integer-encode every object-dtype column of *df* in place.

    Each distinct value is mapped to 0, 1, 2, ... in order of first
    appearance.

    NOTE(review): this redefines the earlier label_encoder but, unlike
    it, does NOT fill NaNs with -999 — presumably intentional for the
    meta table; confirm.
    """
    cat_mask = df.dtypes == object
    cat_features = df.columns[cat_mask].tolist()
    for feature in cat_features:
        # enumerate() replaces the original hand-rolled running counter.
        dict_map = {value: code for code, value in enumerate(df[feature].unique())}
        df[feature] = df[feature].map(dict_map)
    return df
# Build the meta-learning table: one row per (dataset, algorithm) with a
# binary Result flag marking the winning algorithm for that dataset.
meta_df = pd.read_csv('meta/RegressionAll.csv')
algorithm_names = ['AcceleratedBoost', 'CatBoost', 'PaloBst', 'RGFR']
meta_df['Alghorithm_name'] = [algorithm_names] * meta_df.shape[0]
meta_df = meta_df.explode('Alghorithm_name')
meta_df['Result'] = meta_df.apply(
    lambda row: 1 if row['Alghorithm_name'] == result_dict[row['name']] else 0,
    axis=1)
meta_df.to_csv('results/table_for_meta.csv', index=False)
meta_df.to_excel('results/table_for_meta.xlsx', index=False)
meta_df = label_encoder(meta_df)
meta_df.head(5)
# Leave-one-dataset-out evaluation of the XGBoost meta-classifier.
names = meta_df['name'].unique()
roc_auc = []
accuracy = []
# Hoisted out of the loop: features/target do not depend on the held-out
# dataset, and X is reused below for the SHAP plots.
X = meta_df.drop('Result', axis=1)
y = meta_df['Result']
for name in names:
    # ________test: the single held-out dataset________
    # (the original's "train"/"test" section comments were swapped)
    X_test = meta_df[meta_df['name'] == name]
    y_test = X_test['Result']
    X_test = X_test.drop('Result', axis=1)
    # ________train: all remaining datasets________
    X_train = meta_df[meta_df['name'] != name]
    y_train = X_train['Result']
    X_train = X_train.drop('Result', axis=1)
    # ________model________
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_pred))
    roc_auc.append(roc_auc_score(y_test, y_pred_prob))
average_roc = np.mean(roc_auc)
average_acc = np.mean(accuracy)
print('ROC AUC: {}'.format(average_roc))
print('Accuracy: {}\n'.format(average_acc))
# NOTE: `model` here is the model from the last fold only.
print('Model parametrs: {}'.format(model.get_params()))
# Global feature-importance views of the last-trained fold model,
# one figure per xgboost importance type.
fig, ax = plt.subplots(1,1,figsize=(10,20))
xgb.plot_importance(model,ax=ax, importance_type='weight')
plt.title("xgboost feature importance")
plt.show()
fig, ax = plt.subplots(1,1,figsize=(10,20))
xgb.plot_importance(model,ax=ax, importance_type='gain')
plt.title("xgboost feature importance")
plt.show()
fig, ax = plt.subplots(1,1,figsize=(10,20))
xgb.plot_importance(model,ax=ax, importance_type='cover')
plt.title("xgboost feature importance")
plt.show()
# SHAP explanations over the full meta feature matrix X (defined in the
# leave-one-out loop above).
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X,plot_type="bar")
shap.summary_plot(shap_values, X)
shap.initjs()
# NOTE(review): SHAP row 37 is paired with feature row 32 — the indices
# disagree; one of them looks like a typo. Confirm which row was intended.
shap.force_plot(explainer.expected_value, shap_values[37,:], X.iloc[[32],:])
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values, X)